import numpy as np
import pandas as pd
import re
# NOTE: the data file 'arxiv-metadata-oai-2019.json' has not been included.
# lines=True makes pandas parse the file as one JSON object per line.
data = pd.read_json(
    path_or_buf='arxiv-metadata-oai-2019.json',
    lines=True,
)

# Alibaba Tianchi - Data Analysis for Beginners - Academic Frontier Trend Analysis
# Task-1: paper counts
# Count the number of 2019 papers in each computer-science category.

# The raw 'categories' field is a whitespace-separated string of category
# codes; turn it into a list of codes per paper.
data_categories_splited = data['categories'].str.split()
data['categories'] = data_categories_splited

# explode() yields one row per (paper, category) pair, so value_counts() gives
# the number of papers listed under each category directly. This avoids
# building a dense papers-by-categories frame just to sum its columns.
# sort_index() orders the result by category code for stable, readable output.
categories_series = data['categories'].explode().value_counts().sort_index()

# Keep only the categories whose code starts with 'cs'.
print(categories_series[categories_series.index.str.startswith('cs')])

# Task-2: paper author statistics
# Goal: find the 10 most frequent author names across all papers.
# This section only builds the flat list of author names (one entry per
# author per paper); it does not compute the ranking itself.

# Treat the standalone word "and" (optionally preceded by a comma) as just
# another separator. Matching it only when surrounded by whitespace keeps
# names that merely contain the letters "and" (e.g. "Alexander") intact.
data_authors_splited = (
    data['authors']
    .str.replace(r',?\s+and\s+', ',', regex=True)
    # Strip surrounding whitespace so the same name always compares equal,
    # and drop empty pieces left by stray separators.
    .apply(lambda names: [name.strip() for name in names.split(',') if name.strip()])
)
data['authors'] = data_authors_splited
authors_list = data['authors'].values.tolist()

# Flatten the per-paper lists into a single list of author names.
author_list = [author for authors in authors_list for author in authors]
print(author_list)